Land Use Regression modelling - example

Author

Adithi R Upadhya

Setting up

  • Source the functions.R file that has custom-functions and loads the required libraries.
  • Data frames shown here are restricted to 5 observations. For detailed view check in RStudio after cloning this repository.
```{r}
#| label: load-libraries
source(here::here("R", "functions.R"))
```

Coordinate reference systems

  • Define your coordinate system used for collecting data, here the Geographic coordinate system is wgs and also define your projected coordinate system, here it is UTM_proj to calculate buffer and distances in meters
  • _proj suffix denotes that data is converted to UTM_proj CRS.
  • Mention your response variable to build the model on.
```{r}
#| label: define-projections
wgs <- "+proj=longlat +datum=WGS84 +no_defs"
UTM_proj <- "+proj=utm +zone=43 +datum=WGS84" 
response_variable <- "PM2.5"
varname <- sym("PM2.5")
```

Delhi sites data

  • Read the Delhi data, please make sure all your data has a CODE column to uniquely identify the points of training / or each site of interest.
```{r}
#| label: read-data-transform
delhi_sites_df <- read.csv(here("data/Delhi_code_spatial_info", "Delhi_site_lat_long_info.csv")) %>% 
  distinct(CODE, .keep_all = T) %>% 
  mutate(lat = latitude, long = longitude)
delhi_sites_sf <- convert_sf(delhi_sites_df, wgs, "longitude", "latitude")
delhi_sites_proj <- convert_sf_proj(delhi_sites_df, wgs, UTM_proj, "longitude", "latitude") %>% 
  dplyr::select(CODE, lat, long)
delhi_sites_proj %>% 
  head() %>% 
  gt() %>% 
  tab_options(container.overflow.x = TRUE,
  container.overflow.y = TRUE)
```
CODE lat long geometry
site_5024 28.81533 77.15301 c(710101.338002067, 3189429.15533144)
site_301 28.64762 77.31581 c(726353.920741912, 3171140.02753425)
site_1420 28.69538 77.18166 c(713141.99369187, 3176186.2354132)
site_108 28.47069 77.10994 c(706571.503447317, 3151158.76696426)
site_1560 28.77620 77.05107 c(700226.381105639, 3184916.60793017)
site_104 28.72565 77.20116 c(714984.894085235, 3179576.02272205)

View Delhi sites (interactive map)

```{r}
#| label: view-delhi-sites
tmap_mode("view")
tm_basemap(leaflet::providers$Stamen.Toner, alpha = 0.2) + 
  tm_shape(delhi_sites_proj) +
  tm_dots(size = 0.1, col = "blue") 
```

Load the table of desired direction of effect of the parameters

```{r}
#| label: read-direction-of-effect-table
direction_of_effect_table <- read_csv(here("data/direction_of_effect", "parameters_pm.csv")) %>% 
  dplyr::select(param, sign, val)
```

Reading in other spatial predictors

  1. Airport coordinates in the object airport. Note, that there is one airport location because there is just one runway;
  2. Industries coordinates in the object industries;
  3. Railways in the railway object;
  4. Convert all of them to projected coordinate system.
```{r}
#| label: read-spatial-predictors
airport <- read_csv(here("data/airport","airport_point.csv")) 
airport_proj <- convert_sf_proj(airport, wgs, UTM_proj, "Longitude", "Latitude")
filters_to_check <- c("stone cutting", "foundry", "metal fabrication",
                      "rubber", "plastic", "electricals/metal fabrication/moulds", 
                      "Paper & pulp", "machines", "glass works", "charcoal", 
                      "RMC", "rice mill", "flyash bricks", "forging")
industries <- read_csv(here::here("data/industries", "Delhi_Industry_Details.csv")) %>% 
  filter(Type %in% filters_to_check)
industries <- convert_sf_proj(industries, wgs, UTM_proj, "Long", "Lat")
railway <- sf_proj(here("data/railways", "Delhi_railways.shp"), UTM_proj)
```

View Railways (interactive map)

```{r}
#| label: view-railway-data
tm_basemap(leaflet::providers$Stamen.Toner, alpha = 0.2) + 
  tm_shape(railway) +
  tm_lines() 
```

View buffer of radius 1000 m (interactive map)

```{r}
#| label: view-buffers

buffering_railway <- lur_buffer_maker(name = "rail_buffer_", buffer_len = c(500, 1000, 5000))
buffers <- buffer_points(buffering_railway, delhi_sites_proj)
buffers_railway <- mapply(FUN = sf::st_intersection,
                      x = buffers,
                      MoreArgs = list(y = railway),
                      SIMPLIFY = FALSE, 
                      USE.NAMES = TRUE)


tm_basemap(leaflet::providers$Stamen.Toner, alpha = 0.2) + 
  tm_shape(buffers$rail_buffer_1000m) + 
  tm_polygons()
```

View railway inside of buffer of radius 1000 m (interactive map)

  • From this step we calculate length of railway in that buffer.
```{r}
#| label: view-railway-buffers

tm_basemap(leaflet::providers$Stamen.Toner, alpha = 0.2) + 
  tm_shape(buffers_railway$rail_buffer_1000m) + 
  tm_lines()
```

Extracting distance variables for all CODE

```{r}
#| label: extract-spatial-predictors
delhi_sites_proj_var <- dist_variable_extraction(delhi_sites_proj, 
                                            airport_sf = airport_proj,
                                            industries = industries)
delhi_sites_proj_var %>% 
  head() %>% 
  gt() %>% 
  tab_options(container.overflow.x = TRUE,
  container.overflow.y = TRUE)
```
CODE lat long geometry inverse_distance_industries inverse_distance_airport
site_5024 28.81533 77.15301 c(710101.338002067, 3189429.15533144) 5.166996e-04 3.508913e-05
site_301 28.64762 77.31581 c(726353.920741912, 3171140.02753425) 1.200683e-04 4.241529e-05
site_1420 28.69538 77.18166 c(713141.99369187, 3176186.2354132) 2.767450e-04 5.907497e-05
site_108 28.47069 77.10994 c(706571.503447317, 3151158.76696426) 6.151745e-05 9.621531e-05
site_1560 28.77620 77.05107 c(700226.381105639, 3184916.60793017) 3.227879e-04 4.173753e-05
site_104 28.72565 77.20116 c(714984.894085235, 3179576.02272205) 1.338452e-04 4.810983e-05

Reading and extracting raster for all CODE

```{r}
#| label: extract-raster-predictors
dem_name <- "Delhi_SRTM_Elevation.tif"
aod_name <- "Delhi_AOD_map_2019.tif"

aod <- raster(here("data/AOD/interpolated", aod_name))
dem <- raster(here("data/DEM", dem_name))

delhi_sites_proj_var <- extract_raster(delhi_sites_proj_var, dem, aod, wgs, UTM_proj)
```

Adding land use and other variables

  • Here the land use, population, and the railway lengths are replaced with 0 wherever the values were NA.
  • The PM2.5 data is also added at this step.
```{r}
#| label: read-other-predictors
lulc_file_name <- "lulc_vars.csv"
pop_file_name <- "pop_vars.csv"

df_railway <- railway_fun(buffering_railway, delhi_sites_proj_var, railway)

file_lulc <- read_csv(here::here("data/landuse", lulc_file_name))
file_lulc <- file_lulc %>% 
  dplyr::select(everything(), -`...1`) 
df_lulc <- derive_lulc_as_df(file_lulc)

df_pop <- read.csv(here::here("data/population", pop_file_name), sep = ",") 

dataframe_all_params <- list(df_lulc, df_railway, df_pop) %>% 
  reduce(left_join, by = "CODE") %>%
  as.data.frame() %>% 
  dplyr::select(where(~!all(is.na(.x)))) %>% 
  mutate_if(is.numeric, ~replace_na(., 0)) 

pollutant_data <- read.csv(here("data/training_data", "yearly_avg_2018_2021.csv")) %>% 
  mutate(PM2.5 = as.numeric(as.character(pm25)),
         CODE = site) %>% 
  filter(year == 2019) %>% 
  dplyr::select(CODE, !!!response_variable)
  
dataframe_all_params <- list(dataframe_all_params, delhi_sites_proj_var, pollutant_data) %>% 
  reduce(left_join, by = "CODE") %>%
  as.data.frame() %>% 
  dplyr::select(CODE, lat, long, contains("ndvi"), everything(), 
                -geometry) 

dataframe_all_params %>% 
  head() %>% 
  gt() %>% 
  tab_options(container.overflow.x = TRUE,
  container.overflow.y = TRUE)
```
CODE lat long tree_cover_buffer_100 tree_cover_buffer_1000 tree_cover_buffer_300 tree_cover_buffer_500 tree_cover_buffer_5000 moss_lichen_buffer_100 moss_lichen_buffer_1000 moss_lichen_buffer_300 moss_lichen_buffer_500 moss_lichen_buffer_5000 shrubland_buffer_100 shrubland_buffer_1000 shrubland_buffer_300 shrubland_buffer_500 shrubland_buffer_5000 cropland_buffer_100 cropland_buffer_1000 cropland_buffer_300 cropland_buffer_500 cropland_buffer_5000 builtup_buffer_100 builtup_buffer_1000 builtup_buffer_300 builtup_buffer_500 builtup_buffer_5000 bare_land_buffer_100 bare_land_buffer_1000 bare_land_buffer_300 bare_land_buffer_500 bare_land_buffer_5000 snow_ice_buffer_100 snow_ice_buffer_1000 snow_ice_buffer_300 snow_ice_buffer_500 snow_ice_buffer_5000 per_water_bod_buffer_100 per_water_bod_buffer_1000 per_water_bod_buffer_300 per_water_bod_buffer_500 per_water_bod_buffer_5000 mang_buffer_100 mang_buffer_1000 mang_buffer_300 mang_buffer_500 mang_buffer_5000 rail_buffer_1000 rail_buffer_500 rail_buffer_5000 pop_buffer_300m pop_buffer_500m pop_buffer_1000m pop_buffer_5000m inverse_distance_industries inverse_distance_airport elevation aod PM2.5
site_103 28.55120 77.27357 6851.153 1065032.5 118329.722 357708.860 19814556 0 0 0 0 0 0.000 43356.79 3150.659 5863.825 2068209 0.000 33386.40 2707.5252 4724.1779 6409019 21596.818 1819040.7 138814.80 376540.0 39071422 2542.1478 138773.79 15940.280 30039.651 5669078 0 0 0 0 0 0 0.000 0.0000 0.00 4458857.616 0 0 0 0 0 12175.984 3977.9874 214695.16 4003.1527 11889.160 56925.505 1415292.3 3.591027e-04 5.688019e-05 14.69694 0.6815570 103.05624
site_104 28.72565 77.20116 0.000 433357.8 2914.564 23365.285 12087793 0 0 0 0 0 1656.375 68221.97 13204.398 14766.572 1113201 0.000 432785.94 10408.0634 48691.3101 15372324 9853.280 970397.1 100435.45 279256.0 37650307 19483.1537 1158726.64 151286.482 393259.063 7971233 0 0 0 0 0 0 36205.682 708.9311 15565.27 3298992.133 0 0 0 0 0 0.000 0.0000 74880.25 3325.8185 12828.520 53325.431 1585732.1 1.338452e-04 4.810983e-05 14.35270 0.7308234 NA
site_105 28.65738 77.15854 2140.187 270127.2 31687.842 57921.705 17850686 0 0 0 0 0 0.000 50556.30 0.000 0.000 3425392 0.000 29128.26 961.7447 2863.3369 1867277 27331.567 2562226.0 236799.42 682067.8 51588235 1516.8526 187613.24 9500.576 32039.236 2409701 0 0 0 0 0 0 0.000 0.0000 0.00 351502.176 0 0 0 0 0 24536.020 9692.3986 238139.78 8971.2028 27655.481 111071.197 2152789.3 1.009265e-03 8.229322e-05 15.19868 0.6665401 107.89892
site_106 28.56278 77.11801 0.000 95621.1 0.000 7968.549 18480729 0 0 0 0 0 574.129 575009.19 35171.519 112804.162 5273156 0.000 119595.20 0.0000 156.8319 7976856 30416.428 2253784.8 243771.29 651677.9 41249967 0.0000 51205.55 0.000 2270.089 4503949 0 0 0 0 0 0 4375.454 0.0000 0.00 6650.464 0 0 0 0 0 3074.308 409.0450 65323.26 3430.1072 9987.091 32906.966 643594.6 1.651350e-04 4.339879e-04 15.29706 0.6526977 94.27016
site_107 28.63965 77.14626 1800.713 437945.3 35756.062 82192.421 21574951 0 0 0 0 0 3760.201 669447.87 76783.085 213086.545 3892248 0.000 37089.67 5159.3755 9007.0879 2380191 25167.743 1897310.7 141730.31 442243.1 46974931 262.3399 57851.31 19529.567 28366.119 2326895 0 0 0 0 0 0 0.000 0.0000 0.00 343230.022 0 0 0 0 0 9373.810 781.4701 206081.70 3490.9162 11997.395 55497.930 1818232.4 4.633128e-04 1.015440e-04 14.83240 0.6981591 82.14127
site_108 28.47069 77.10994 7088.091 717781.9 52473.515 146353.332 14669721 0 0 0 0 0 14099.656 1023883.41 119949.805 314997.451 9999141 1989.706 181827.03 22672.2330 62979.9057 10835718 4414.912 875916.6 66605.94 193055.0 34622915 3395.1672 300138.90 17243.647 57489.099 7336732 0 0 0 0 0 0 0.000 0.0000 0.00 25657.298 0 0 0 0 0 0.000 0.0000 46364.72 253.1731 1061.052 6137.197 447963.4 6.151745e-05 9.621531e-05 16.61325 0.7053154 82.60988

Performing univariate regression

  • Here, we performed the univariate regression (ex: PM2.5 vs ndvi_buffer_200m) to find out the parameter which has the highest R2 and the desired direction of effect.
  • We created an empty dataframe data_univariate_summary to add the values for univariate regression.
  • We also checked for continuous repeated values in the columns using a data frame cou.
  • We check if a column has more than one non-zero or non-NA value to build the model.
  • This step is necessary as we have replaced NA values with 0 and also we need more than one value to build a linear model.
  • Check if column exists after cleaning and then in an iterative way perform univariate regression.
  • Apply add_sign_check to the data_univariate_summary / univariate regression table we generated, keep the variables where the desired direction / sign of effect is preserved.
  • Extract the highest R2 and the parameter which has the highest R2.
```{r}
#| label: univariate-regression
dataframe_all_params <- dataframe_all_params %>% 
  dplyr::select(where(~!all(is.na(.x)))) %>% 
  dplyr::select(CODE, !!!response_variable, lat, long, contains("ndvi"), everything())

col <- names(dataframe_all_params)[-c(1:2)]
dataframe_all_params[, col] <- sapply(X = dataframe_all_params[, col],
                        FUN = function(x) as.numeric(as.character(x)))

data_univariate_summary <- data.frame(matrix(ncol = 6, nrow = 0))
x <- c("value", "slope", "intercept", "R2", "f_val", "p_val")
colnames(data_univariate_summary) <- x
perc <- round(0.75 * nrow(dataframe_all_params))
for (i in col) { 
  dataframe_all_params <- dataframe_all_params[vapply(dataframe_all_params, 
                                                      function(x) length(unique(x)) > 1, 
                                                      logical(1L))]
  if(i %in% colnames(dataframe_all_params)) {
    cou <- (dataframe_all_params %>% 
              group_by(!!!syms(i)) %>% 
              summarise(n = n()) %>% 
              filter(n == max(n)))$n[1]
    if(cou < perc) {
      univariate_model <- lm(paste(response_variable, " ~ ", i), data = dataframe_all_params) 
      slope <- summary(univariate_model)$coefficients[2, 1]
      R2 <- summary(univariate_model)$adj.r.squared
      f <- summary(univariate_model)$fstatistic
      one_univardata_univariate_summary <- data.frame(value = as.character(i), slope, 
                             intercept = summary(univariate_model)$coefficients[1, 1],
                             R2, f_val = summary(univariate_model)$fstatistic[[1]], 
                             p_val = pf(f[1], f[2], f[3], lower.tail = F)[[1]])
      data_univariate_summary <- rbind(data_univariate_summary, one_univardata_univariate_summary)
     } else {
      data_univariate_summary <- data_univariate_summary
    }
}}

data_univariate_summary <- add_sign_check(data_univariate_summary, direction_of_effect_table)

data_univariate_summary <- data_univariate_summary %>%
  filter(obj != FALSE)

highest_para <- (data_univariate_summary %>% 
                   arrange(desc(R2)))$value[1]  
original_para <- highest_para

highest_r2 <- (data_univariate_summary %>% 
                   arrange(desc(R2)))$R2[1]
original_r2 <- highest_r2
```

Stepwise supervised linear regression

  • The highest R2 in the univariate model is 0.2879041 and the parameter is lat.
  • We will now perform the stepwise supervised linear regression.
```{r}
#| label: stepwise-supervised-linear-regression
dataframe_all_params <- dataframe_all_params %>% 
  dplyr::select(CODE, !!!response_variable, everything())
col_interest <- colnames(dataframe_all_params[, -c(1:2)])
dataframe_all_params[, col_interest] <- sapply(X = dataframe_all_params[, col_interest],
                                          FUN = function(x) as.numeric(as.character(x)))

c(variable_list, change_in_r2, highest_r2, df_slope_info, df_r2_info) := 
  run_lur_model(col_interest, original_para, original_r2, 
                response_variable, dataframe_all_params, direction_of_effect_table, 0.01)
```

Results of stepwise supervised linear regression

  • The change in R2 was observed to be 0.001, the multivariate model adjusted R2 now is 0.4799.
  • The parameter / variables included in this model with highest R2 is lat, builtup_buffer_500, shrubland_buffer_300, tree_cover_buffer_5000, cropland_buffer_5000.
  • The dataframe with the best models (highest R2 with right direction of effect of all included variables / parameters) at each step of adding a variable or parameter is shown below.
```{r}
#| label: view-table
df_r2_info %>% 
  arrange(desc(r2)) %>% 
  gt() %>% 
  tab_options(container.overflow.x = TRUE,
  container.overflow.y = TRUE)
```
value slope stde tval prob sig param sign val obj r2 aic rmse
lat + builtup_buffer_500 + shrubland_buffer_300 + tree_cover_buffer_5000 + cropland_buffer_5000
lat 4.733912e+01 28.5437 1.6585 0.1080 lat is.na NA TRUE 0.4799 269.2363 10.18912
builtup_buffer_500 9.240784e-06 0.0000 0.5977 0.5547 builtup > 0 TRUE 0.4799 269.2363 10.18912
shrubland_buffer_300 -1.460253e-04 0.0001 -2.1455 0.0404 * shrubland < 0 TRUE 0.4799 269.2363 10.18912
tree_cover_buffer_5000 -6.770850e-07 0.0000 -2.1246 0.0423 * tree_cover < 0 TRUE 0.4799 269.2363 10.18912
cropland_buffer_5000 -3.180859e-07 0.0000 -1.6061 0.1191 cropland < 0 TRUE 0.4799 269.2363 10.18912
lat + builtup_buffer_500 + shrubland_buffer_300 + tree_cover_buffer_5000
lat 3.804236e+01 28.6771 1.3266 0.1947 lat is.na NA TRUE 0.4525 270.2189 10.45394
builtup_buffer_500 2.613317e-05 0.0000 2.2475 0.0321 * builtup > 0 TRUE 0.4525 270.2189 10.45394
shrubland_buffer_300 -1.121162e-04 0.0001 -1.6888 0.1016 shrubland < 0 TRUE 0.4525 270.2189 10.45394
tree_cover_buffer_5000 -3.639978e-07 0.0000 -1.4072 0.1696 tree_cover < 0 TRUE 0.4525 270.2189 10.45394
lat + builtup_buffer_500 + shrubland_buffer_300
lat 6.117091e+01 23.8685 2.5628 0.0155 * lat is.na NA TRUE 0.4351 270.4562 10.61793
builtup_buffer_500 2.666320e-05 0.0000 2.2588 0.0311 * builtup > 0 TRUE 0.4351 270.4562 10.61793
shrubland_buffer_300 -9.156408e-05 0.0001 -1.3921 0.1738 shrubland < 0 TRUE 0.4351 270.4562 10.61793
lat + builtup_buffer_500
lat 7.291190e+01 22.6534 3.2186 0.0029 ** lat is.na NA TRUE 0.4186 270.5784 10.77240
builtup_buffer_500 3.248903e-05 0.0000 2.9014 0.0067 ** builtup > 0 TRUE 0.4186 270.5784 10.77240

Model building

  • Select relevant variables to make a new dataframe / df dataframe_selected_params from the orignal all parameter / variable list dataframe_all_params.
  • Once the variable list is identified and the dataframe selected, build the model (my_model_built) and look at the summary.
```{r}
#| label: model-building
others <- paste(variable_list, collapse = " + ")
dataframe_selected_params <- dataframe_all_params[ , which(names(dataframe_all_params) 
                                                           %in% c(variable_list, 
                                                                  response_variable, "CODE"))]
equ <- as.formula(paste(response_variable, others, sep = " ~ "))
my_model_built <- lm(as.formula(equ), data = dataframe_selected_params)
summary(my_model_built)
```

Call:
lm(formula = as.formula(equ), data = dataframe_selected_params)

Residuals:
     Min       1Q   Median       3Q      Max 
-16.8189  -7.4336  -0.9592   5.9710  18.3064 

Coefficients:
                         Estimate Std. Error t value Pr(>|t|)  
(Intercept)            -1.234e+03  8.176e+02  -1.510   0.1419  
lat                     4.734e+01  2.854e+01   1.658   0.1080  
builtup_buffer_500      9.241e-06  1.546e-05   0.598   0.5547  
shrubland_buffer_300   -1.460e-04  6.806e-05  -2.145   0.0404 *
tree_cover_buffer_5000 -6.771e-07  3.187e-07  -2.125   0.0423 *
cropland_buffer_5000   -3.181e-07  1.980e-07  -1.606   0.1191  
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 10.19 on 29 degrees of freedom
  (5 observations deleted due to missingness)
Multiple R-squared:  0.5563,    Adjusted R-squared:  0.4799 
F-statistic: 7.273 on 5 and 29 DF,  p-value: 0.0001605

Check p-value of the model

  • Once model built, check for p-values of all the parameters in the model and remove those with p-value > than 0.1.
  • Check if the desired direction of effect for the remaining parameters is still preserved using data_to_check_for_direction_of_effect_after_cleaning.
```{r}
#| label: check-pvalue
dataframe_selected_params_p <- remove_p_value(my_model_built, dataframe_selected_params, 0.1) 
my_model_built <- create_model(dataframe_selected_params_p, response_variable)
summary(my_model_built)
data_to_check_for_direction_of_effect_after_cleaning <- vars(my_model_built, sig_star, direction_of_effect_table)
```
[1] "The columns removed due to p-value greater than "
[2] "0.1"                                             
[3] " are "                                           
[4] "(Intercept)"                                     
[5] "lat"                                             
[6] "builtup_buffer_500"                              
[7] "cropland_buffer_5000"                            

Call:
lm(formula = as.formula(eqtn), data = model_data)

Residuals:
    Min      1Q  Median      3Q     Max 
-22.823  -8.161  -2.140  10.272  25.191 

Coefficients:
                         Estimate Std. Error t value Pr(>|t|)    
(Intercept)             1.222e+02  4.043e+00  30.231  < 2e-16 ***
tree_cover_buffer_5000 -6.116e-07  2.280e-07  -2.683  0.01144 *  
shrubland_buffer_300   -2.096e-04  5.921e-05  -3.539  0.00125 ** 
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 11.3 on 32 degrees of freedom
  (5 observations deleted due to missingness)
Multiple R-squared:  0.3978,    Adjusted R-squared:  0.3602 
F-statistic: 10.57 on 2 and 32 DF,  p-value: 0.0002988

Check vif of the model

  • Next we generate variance inflation factor (VIF) of the model to check for collinearity.
  • We remove parameters / variables from the model built in the previous step with vif > 3.
  • We build the model again and check for p-value using the function remove_p_value.
  • Check if the desired direction of effect for the remaining parameters is still preserved using data_to_check_for_direction_of_effect_after_cleaning.
```{r}
#| label: check-vif
dataframe_selected_params_p_vif <- vif_function(my_model_built, dataframe_selected_params_p, 3)
my_model_built <- create_model(dataframe_selected_params_p_vif, response_variable)
summary(my_model_built)
data_to_check_for_direction_of_effect_after_cleaning <- vars(my_model_built, sig_star, direction_of_effect_table)
```
                           tree_cover_buffer_5000     shrubland_buffer_300 
"The values of vif are "       "1.00481546673693"       "1.00481546673693" 

Call:
lm(formula = as.formula(eqtn), data = model_data)

Residuals:
    Min      1Q  Median      3Q     Max 
-22.823  -8.161  -2.140  10.272  25.191 

Coefficients:
                         Estimate Std. Error t value Pr(>|t|)    
(Intercept)             1.222e+02  4.043e+00  30.231  < 2e-16 ***
tree_cover_buffer_5000 -6.116e-07  2.280e-07  -2.683  0.01144 *  
shrubland_buffer_300   -2.096e-04  5.921e-05  -3.539  0.00125 ** 
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 11.3 on 32 degrees of freedom
  (5 observations deleted due to missingness)
Multiple R-squared:  0.3978,    Adjusted R-squared:  0.3602 
F-statistic: 10.57 on 2 and 32 DF,  p-value: 0.0002988

Cook’s distance

  • Since vif of the model built did not remove parameters / variables from the model (< 3).
  • Calculate the cook’s distance to remove influential observations.
  • Plot the cook’s d values for the model built at the previous step.
```{r}
#| label: check-cooksdist
cooks_dist <- as.data.frame(cooks.distance(my_model_built)) %>% 
  dplyr::select("CD" = `cooks.distance(my_model_built)`) %>% 
  filter(CD >= 1)
plot(cooks.distance(my_model_built), ylab = "cook's distance", xlab = "observation")
no_rem <- nrow(cooks_dist)
```

  • The no of rows which have cook’s d > than 1 are 0.
  • If cook’s distance is > 1 for any observation remove that from the dataframe_selected_params_p_vif else keep all the observations.
  • Here we do not have any influential observation.
```{r}
#| label: remove-cooksdist
if(dim(cooks_dist)[1] == 0) {
  dataframe_selected_params_p_vif_cd <- dataframe_selected_params_p_vif
} else {
  dataframe_selected_params_p_vif_cd <- dataframe_selected_params_p_vif[-as.numeric(row.names(cooks_dist)), ]
  dataframe_selected_params_p_vif_cd <- dataframe_selected_params_p_vif[as.numeric(row.names(cooks_dist)), ]
  my_model_built <- create_model(dataframe_selected_params_p_vif_cd, response_variable)
  data_to_check_for_direction_of_effect_after_cleaning <- vars(my_model_built, sig_star, direction_of_effect_table)
}
```

Final model building

  • Build the final model.
  • Check p-value and use this model for validation.
```{r}
#| label: final-model
final_model_built <- create_model(dataframe_selected_params_p_vif_cd, response_variable)
summary(final_model_built)
```

Call:
lm(formula = as.formula(eqtn), data = model_data)

Residuals:
    Min      1Q  Median      3Q     Max 
-22.823  -8.161  -2.140  10.272  25.191 

Coefficients:
                         Estimate Std. Error t value Pr(>|t|)    
(Intercept)             1.222e+02  4.043e+00  30.231  < 2e-16 ***
tree_cover_buffer_5000 -6.116e-07  2.280e-07  -2.683  0.01144 *  
shrubland_buffer_300   -2.096e-04  5.921e-05  -3.539  0.00125 ** 
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 11.3 on 32 degrees of freedom
  (5 observations deleted due to missingness)
Multiple R-squared:  0.3978,    Adjusted R-squared:  0.3602 
F-statistic: 10.57 on 2 and 32 DF,  p-value: 0.0002988

Leave One Out Cross Validation (LOOCV)

  • Perform LOOCV on the final model built.
```{r}
#| label: loocv-model
others <- names(dataframe_selected_params_p_vif_cd[, !names(dataframe_selected_params_p_vif_cd)
                                                   %in% c(response_variable, "CODE")])
equ <- as.formula(paste(response_variable, paste(others, collapse = "+"), sep = " ~ "))
dataframe_selected_params_p_vif_cd_loocv <- loocv_loop(dataframe_selected_params_p_vif_cd, 
                                                       response_variable) %>% 
  dplyr::select(CODE, contains("predicted"))
mean_adj_r2 <- mean(dataframe_selected_params_p_vif_cd_loocv$predicted_loocv_r2_adj, na.rm = T)
```
  • The mean adjusted R2 for LOOCV is 0.3600115.

Spearman’s coefficient and model saving

  • Predict using the final model to calculate Spearman’s coefficient.
  • Calculate the residual.
  • Save the model in .rds format if required.
  • Save the residual in GeoPackage format or any other desired format.
```{r}
#| label: final-steps
# saveRDS(all_data, "pm2.5_2019.rds")

dataframe_selected_params_p_vif_cd$predicted_model <- predict(final_model_built, 
                                                              newdata = dataframe_selected_params_p_vif_cd)

cor.test(dataframe_selected_params_p_vif_cd$predicted_model, 
         dataframe_selected_params_p_vif_cd$PM2.5, method = "spearman")

file_sf_season <- delhi_sites_df %>% 
  dplyr::select(CODE, lat, long) %>% 
  left_join(., dataframe_selected_params_p_vif_cd, by = "CODE", suffix = c("", ".y")) %>% 
  dplyr::select(-ends_with(".y")) %>%  
  mutate(residual = !!varname - predicted_model) %>% 
  st_as_sf(., coords = c("long", "lat"), crs = wgs) 

# st_write(file_sf_season, dsn = here("pm2.5_2019.geojson"), driver = 'GeoJSON')
```

    Spearman's rank correlation rho

data:  dataframe_selected_params_p_vif_cd$predicted_model and dataframe_selected_params_p_vif_cd$PM2.5
S = 2894.7, p-value = 0.0001655
alternative hypothesis: true rho is not equal to 0
sample estimates:
      rho 
0.5945795 

Observed vs Predicted

```{r}
#| label: predicted_vs_observed
ggplot(dataframe_selected_params_p_vif_cd, aes(x = !!varname, y = predicted_model)) + 
  geom_point() + geom_abline() + theme_classic() + 
  scale_x_continuous(limits = c(0, NA)) + scale_y_continuous(limits = c(0, NA)) + 
  labs(x = "observed", y = "predicted")
```